/*
 * Javlov - a Java toolkit for reinforcement learning with multi-agent support.
 * 
 * Copyright (c) 2009 Matthijs Snel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package net.javlov.policy;

import java.util.List;

import net.javlov.Action;
import net.javlov.Actor;
import net.javlov.Policy;
import net.javlov.QFunction;
import net.javlov.State;

/**
 * Uses a softmax distribution over the stored action values to select an action. Thus,
 * this actor can only be used with discrete actions.
 * 
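 * Under the standard softmax (Boltzmann) form, the probability of selecting action
 * {@code a} in state {@code s} is
 * {@code P(a|s) = exp(v(s,a)/tau) / sum_b exp(v(s,b)/tau)},
 * where {@code v(s,a)} is the value stored in the supplied Q-function and {@code tau}
 * is a temperature parameter; the exact form used is determined by {@link SoftmaxPolicy}.
 * 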
 * @author Matthijs Snel
 *
 */
public class SoftmaxActor implements Actor {

	/**
	 * The Q-function that will be used to store action selection probabilities.
	 */
	protected QFunction q;
	
	/**
	 * The last selected action; stored in order to be able to update it with the TD error.
	 */
	protected Action lastAction;
	
	/**
	 * Policy used to select actions from probabilities.
	 */
	protected Policy pi;
	
	/**
	 * Constructs an actor based on the provided Q-function and action pool. Note that
	 * the {@code QFunction} is used here purely as a storage medium; it will not store
	 * Q-values, but instead the probabilities of selecting each action.
	 * 
	 * The values stored in the Q-function do not necessarily need to sum to 1, since
	 * this actor applies a softmax distribution to those values, which guarantees that
	 * the resulting probabilities sum to 1 in any case. If a tabular Q-function is used,
	 * it is however recommended to initialise the values to be greater than 0, e.g. each
	 * to 1 / (nr of actions).
	 * 
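	 * A minimal construction sketch; the {@code TabularQFunction} name and its
	 * initialisation are assumptions for illustration, not part of this API:
	 * 
	 * <pre>{@code
	 * List<Action> actions = ...;                  // the discrete action pool
	 * QFunction q = new TabularQFunction();        // hypothetical implementation
	 * // set every entry to 1 / actions.size(), as recommended above
	 * Actor actor = new SoftmaxActor(q, actions);
	 * }</pre>
	 * 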
	 * @param q the Q-function that stores the action probabilities.
	 * @param actions the pool of available actions.
	 */
	public SoftmaxActor(QFunction q, List<? extends Action> actions) {
		this.q = q;
		pi = new SoftmaxPolicy(q, actions);
	}
	
	/**
	 * {@inheritDoc}
	 */
	@Override
	public <T> Action getAction(State<T> s) {
		Action a = pi.getAction(s);
		// remember the selected action so it can be updated with the TD error
		lastAction = a;
		q.setLastAction(a);
		return a;
	}

	/**
	 * {@inheritDoc}
	 */
	@Override
	public double getLearnRate() {
		return q.getLearnRate();
	}

	/**
	 * {@inheritDoc}
	 */
	@Override
	public void init() {
		q.init();
	}

	/**
	 * {@inheritDoc}
	 */
	@Override
	public void reset() {
		q.reset();
	}

	/**
	 * {@inheritDoc}
	 */
	@Override
	public void setLearnRate(double alpha) {
		q.setLearnRate(alpha);
	}

	/**
	 * Adds the provided TD error, multiplied by the learning rate alpha,
	 * to the current probablity of the action that was selected last.
	 * The probabilities of selecting the other actions will be decreased
	 * such that the sum of all probabilities adds to 1.
	 * 
	 * This implementation uses the TD error directly to increase the probability of the
	 * last selected action, i.e.
	 * 
	 * {@code p(s,a) = p(s,a) + alpha*TDerr},
	 * 
	 * or, if eligibility traces are used (simply pass the actor a "traced" Q-function):
	 * 
	 * {@code p(s,a) = p(s,a) + alpha*TDerr*e(s,a).}
	 * 
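	 * A typical actor-critic step using this method might look as follows; the
	 * {@code critic} object and its {@code getValue}/{@code update} calls are
	 * assumptions for illustration, not part of this class:
	 * 
	 * <pre>{@code
	 * Action a = actor.getAction(s);
	 * // execute a in the environment, observe reward r and next state sNext
	 * double tdErr = r + gamma * critic.getValue(sNext) - critic.getValue(s);
	 * critic.update(tdErr);
	 * actor.update(tdErr); // increases the probability of the last selected action
	 * }</pre>
	 * 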
	 * @param TDerr the TD error that will be used to update the probability of the last
	 * selected action.
	 */
	@Override
	public <T> void update(double TDerr) {
		q.update(TDerr);
	}
}
